Loading necessary packages.

Loading data.

Looking at data:

dim(noc_regions)
## [1] 230   3
summary(athlete_events)
##        ID             Name               Sex                 Age       
##  Min.   :     1   Length:271116      Length:271116      Min.   :10.00  
##  1st Qu.: 34643   Class :character   Class :character   1st Qu.:21.00  
##  Median : 68205   Mode  :character   Mode  :character   Median :24.00  
##  Mean   : 68249                                         Mean   :25.56  
##  3rd Qu.:102097                                         3rd Qu.:28.00  
##  Max.   :135571                                         Max.   :97.00  
##                                                         NA's   :9474   
##      Height          Weight          Team               NOC           
##  Min.   :127.0   Min.   : 25.0   Length:271116      Length:271116     
##  1st Qu.:168.0   1st Qu.: 60.0   Class :character   Class :character  
##  Median :175.0   Median : 70.0   Mode  :character   Mode  :character  
##  Mean   :175.3   Mean   : 70.7                                        
##  3rd Qu.:183.0   3rd Qu.: 79.0                                        
##  Max.   :226.0   Max.   :214.0                                        
##  NA's   :60171   NA's   :62875                                        
##     Games                Year         Season              City          
##  Length:271116      Min.   :1896   Length:271116      Length:271116     
##  Class :character   1st Qu.:1960   Class :character   Class :character  
##  Mode  :character   Median :1988   Mode  :character   Mode  :character  
##                     Mean   :1978                                        
##                     3rd Qu.:2002                                        
##                     Max.   :2016                                        
##                                                                         
##     Sport              Event              Medal          
##  Length:271116      Length:271116      Length:271116     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
## 

Integer encoding the NOC data for possible later usage:

NOC_Int <- c(as.numeric(as.factor(noc_regions$NOC)))
noc_regions$NOC.Int <- NOC_Int
noc_regions

Looking at oldest and youngest athletes, finding athletes with NA age:

#ages <- athlete_events$Age
#max(athlete_events$Age, na.rm = TRUE)
oldAge <- subset(athlete_events, Age == max(athlete_events$Age, na.rm = TRUE))
youngAge <- subset(athlete_events, Age == min(athlete_events$Age, na.rm = TRUE))
list(oldAge,youngAge)
## [[1]]
##            ID                   Name Sex Age Height Weight          Team
## 257055 128719 John Quincy Adams Ward   M  97     NA     NA United States
##        NOC       Games Year Season      City            Sport
## 257055 USA 1928 Summer 1928 Summer Amsterdam Art Competitions
##                                              Event Medal
## 257055 Art Competitions Mixed Sculpturing, Statues  <NA>
## 
## [[2]]
##           ID               Name Sex Age Height Weight
## 142883 71691 Dimitrios Loundras   M  10     NA     NA
##                                 Team NOC       Games Year Season   City
## 142883 Ethnikos Gymnastikos Syllogos GRE 1896 Summer 1896 Summer Athina
##             Sport                                 Event  Medal
## 142883 Gymnastics Gymnastics Men's Parallel Bars, Teams Bronze
shyAthletes <- subset(athlete_events, is.na(Age))
head(shyAthletes)

Counts of certain columns:

loadPkg("dplyr")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:xts':
## 
##     first, last
## The following object is masked from 'package:GGally':
## 
##     nasa
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
sexC <- athlete_events %>% count(Sex)
ageC <- athlete_events %>% count(Age)
nameC <- athlete_events %>% count(Name)
medalC <- athlete_events %>% count(Medal)
medalC

Barchart of medal data:

medalDat <- subset(athlete_events, !is.na(Medal))
bp1 <- ggplot(athlete_events, aes(x=Medal), color = Team) + geom_bar(aes(y=(..count..)/sum(..count..)))
bp2 <- ggplot(medalDat, aes(x=Medal), color = Team) + geom_bar(aes(y=(..count..)/sum(..count..)))

grid.arrange(bp1, bp2, nrow = 1)

Integer encoding data so that it can be include in checking correlation:

mDat <- medalDat %>% mutate(Medal = replace(Medal, Medal == "Gold", 1))
mDat <- mDat %>% mutate(Medal = replace(Medal, Medal == "Silver", 2))
mDat <- mDat %>% mutate(Medal = replace(Medal, Medal == "Bronze", 3))
mDat <- mDat %>% mutate(Sex = replace(Sex, Sex == "F", 1))
mDat <- mDat %>% mutate(Sex = replace(Sex, Sex == "M", 2))
mNOCi <- c(as.numeric(as.factor(mDat$NOC)))
mSPi <- c(as.numeric(as.factor(mDat$Sport)))
mDat$NOC.Int <- mNOCi
mDat$Sport.Int <- mSPi
head(mDat)

Columns that I want to keep for correlation plot:

keeps <- c("ID","Sex","Age","Height","Weight","NOC.Int","Sport.Int","Medal")
dmDat <- mDat[ , keeps, drop = FALSE]
dmDat[,] <- sapply(dmDat[,], as.numeric)
head(dmDat)

Correlation plot:

loadPkg("digest")
corr = cor(dmDat, use="pairwise.complete.obs") 
c1 <- corrplot.mixed(corr, title="Correlation Matrix", mar=c(0,0,1,0))

c2 <- corrplot(corr, method="pie", type = "upper", order = "hclust", tl.col = "black", tl.srt = 45)

#h1 <- heatmap(x=corr, symm=TRUE)

p.mat <- cor_pmat(dmDat)
gcorr1 <- ggcorrplot(corr, method = "circle", hc.order = TRUE)
gcorr2 <- ggcorrplot(corr, method = "square", hc.order = TRUE)
grid.arrange(gcorr1, gcorr2, nrow = 1)

Essentially very similar to above, just with package PerformanceAnalytics:

chart.Correlation(dmDat, histogram=TRUE, pch=19)

In the above plot: * The distribution of each variable is shown on the diagonal. * On the bottom of the diagonal : the bivariate scatter plots with a fitted line are displayed * On the top of the diagonal : the value of the correlation plus the significance level as stars * Each significance level is associated to a symbol : p-values(0, 0.001, 0.01, 0.05, 0.1, 1) <=> symbols(“***”, “**”, “*”, “.”, " “)

Starting to look at names for analysis:

loadPkg("tidyverse")
## -- Attaching packages --------------------------------------- tidyverse 1.2.1 --
## v tibble  2.1.1     v purrr   0.3.2
## v tibble  2.1.1     v forcats 0.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::combine() masks gridExtra::combine()
## x dplyr::filter()  masks stats::filter()
## x dplyr::first()   masks xts::first()
## x dplyr::lag()     masks stats::lag()
## x dplyr::last()    masks xts::last()
loadPkg("rvest")
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:purrr':
## 
##     pluck
## The following object is masked from 'package:readr':
## 
##     guess_encoding
loadPkg("magrittr")
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
loadPkg("ggmap")
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
## 
## Attaching package: 'ggmap'
## The following object is masked from 'package:magrittr':
## 
##     inset
first.inst.NOC <- first.instance %>% count(NOC)
first.inst.NOC <- first.inst.NOC[order(first.inst.NOC$NOC),]
names(first.inst.NOC)[names(first.inst.NOC) == "n"] <- "rank"
first.inst.NOC$country = noc_regions$region
first.inst.NOC
map.world <- map_data("world")
map.world_joined <- left_join(map.world, first.inst.NOC, by = c('region' = 'country'))
map.world_joined <- map.world_joined %>% mutate(fill_flg = ifelse(is.na(rank),F,T))

register_google(key = "AIzaSyCv1WHHXGBFuIRJBQ3G-5Ijuyft_8sKeHc")
df.country_points <- data.frame(country = c("Singapore","luxembourg"),stringsAsFactors = F)
glimpse(df.country_points)
## Observations: 2
## Variables: 1
## $ country <chr> "Singapore", "luxembourg"
geocode.country_points <- geocode(df.country_points$country)
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=Singapore&key=xxx-5Ijuyft_8sKeHc
## Source : https://maps.googleapis.com/maps/api/geocode/json?address=luxembourg&key=xxx-5Ijuyft_8sKeHc
## "luxembourg" not uniquely geocoded, using "luxembourg"
df.country_points <- cbind(df.country_points,geocode.country_points)
map.world_joined
ggplot() + 
  geom_polygon( data=map.world_joined, aes(x=long, y=lat, group=group),
                color="black", fill="lightblue" )

w <- ggplot()
w <- w + geom_polygon( data=map.world_joined, 
          aes(x=long, y=lat, group=group, fill = rank), 
          color="white", size = 0.2) 
w

w <- w + scale_fill_continuous(name="Number of Olympic Athletes", 
            low = "lightblue", high = "darkblue",limits = c(0,9700), 
            breaks=c(970,1940,2910,3880,4850,5820,6790,7760,8730), na.value = "white") +

          labs(title="Olympic Athletes from Around the Globe")
w

wordcloud(words = first.names.count$First.Name, freq = first.names.count$n, min.freq = 190, max.words=100, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Paired"))

wordcloud(words = lastname.edit$Last.Name, freq = lastname.edit$n, min.freq = 50, max.words=100, random.order=FALSE, rot.per=0.35, colors=brewer.pal(8, "Paired"))
## Warning in wordcloud(words = lastname.edit$Last.Name, freq =
## lastname.edit$n, : Robinson could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = lastname.edit$Last.Name, freq =
## lastname.edit$n, : Lewis could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = lastname.edit$Last.Name, freq =
## lastname.edit$n, : Costa could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = lastname.edit$Last.Name, freq =
## lastname.edit$n, : Khan could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = lastname.edit$Last.Name, freq =
## lastname.edit$n, : Ramrez could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = lastname.edit$Last.Name, freq =
## lastname.edit$n, : Nagy could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = lastname.edit$Last.Name, freq =
## lastname.edit$n, : Clark could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = lastname.edit$Last.Name, freq =
## lastname.edit$n, : Fischer could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = lastname.edit$Last.Name, freq =
## lastname.edit$n, : Wei could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = lastname.edit$Last.Name, freq =
## lastname.edit$n, : Evans could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = lastname.edit$Last.Name, freq =
## lastname.edit$n, : Roberts could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = lastname.edit$Last.Name, freq =
## lastname.edit$n, : Souza could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = lastname.edit$Last.Name, freq =
## lastname.edit$n, : Gutirrez could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(words = lastname.edit$Last.Name, freq =
## lastname.edit$n, : Meyer could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = lastname.edit$Last.Name, freq =
## lastname.edit$n, : Weber could not be fit on page. It will not be plotted.

val2 = which.max(nchar(athlete_events$Name))
athlete_events[val2,]

Looking at height/weight info:

loadPkg("plyr")
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:purrr':
## 
##     compact
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
weightDat <- subset(athlete_events, !is.na(Weight))
heightDat <- subset(athlete_events, !is.na(Height))
muW <- ddply(weightDat, "Sex", summarise, grp.mean=mean(Weight), grp.variance=var(Weight), grp.standarddev=sd(Weight))
muH <- ddply(heightDat, "Sex", summarise, grp.mean=mean(Height), grp.variance=var(Height), grp.standarddev=sd(Height))
muW
muH

Standard histograms forheight/weight:

p1 <- ggplot(weightDat, aes(x=Weight, color=Sex, fill=Sex)) + geom_histogram(alpha=0.5, binwidth = 5, aes(y=..density..), position="dodge") + geom_density(alpha=0.6) + geom_vline(data=muW, aes(xintercept=grp.mean, color=Sex), linetype="dashed") +
labs(title="Weight histogram plot",x="Weight(kg)", y = "Density")

p1c <- p1 + scale_color_brewer(palette="Accent")+ scale_fill_brewer(palette="Accent") + theme_minimal()+theme(legend.position="top")

p2 <- ggplot(heightDat, aes(x=Height, color=Sex, fill=Sex)) + geom_histogram(binwidth = 5, alpha=0.5, aes(y=..density..), position="dodge") + geom_density(alpha=0.6) + geom_vline(data=muH, aes(xintercept=grp.mean, color=Sex), linetype="dashed") +
labs(title="Height histogram plot",x="Height(cm)", y = "Density")

p2c <- p2 + scale_color_brewer(palette="Accent")+ scale_fill_brewer(palette="Accent") + 
  theme_minimal()+theme(legend.position="top")

grid.arrange(p1c, p2c, nrow = 2)

weightDat
outlierKD2 <- function(df, var, rm=FALSE, title, colVal) { 
    #' Original outlierKD functino by By Klodian Dhana,
    #' https://www.r-bloggers.com/identify-describe-plot-and-remove-the-outliers-from-the-dataset/
    #' Modified to have third argument for removing outliers inwtead of interactive prompt, 
    #' and after removing outlier, original df will not be changed. The function returns the new df, 
    #' which can be saved as original df name if desired.
    #' Check outliers, and option to remove them, save as a new dataframe. 
    #' @param df The dataframe.
    #' @param var The variable in the dataframe to be checked for outliers
    #' @param rm Boolean. Whether to remove outliers or not.
    #' @return The dataframe with outliers replaced by NA if rm==TRUE, or df if nothing changed
    #' @examples
    #' outlierKD2(mydf, height, FALSE)
    #' mydf = outlierKD2(mydf, height, TRUE)
    #' mydfnew = outlierKD2(mydf, height, TRUE)
    dt = df # duplicate the dataframe for potential alteration
    var_name <- eval(substitute(var),eval(dt))
    na1 <- sum(is.na(var_name))
    m1 <- mean(var_name, na.rm = T)
    par(mfrow=c(2, 3), oma=c(0,0,3,0))
    boxplot(var_name, main=paste("With outliers for", title), xlab = title, col = colVal)
    hist(var_name, main=paste("With outliers for", title), xlab=title, ylab="frequency", col = colVal, density = 10)
    qqnorm(var_name, main=paste("Q-Q plot with outliers for", title), col = colVal)
    qqline(var_name)
    outlier <- boxplot.stats(var_name)$out
    mo <- mean(outlier)
    var_name <- ifelse(var_name %in% outlier, NA, var_name)
    boxplot(var_name, main=paste("Without outliers for", title), xlab = title, col = colVal)
    hist(var_name, main=paste("Without outliers for", title), xlab=title, ylab="frequency", col = colVal, density = 10)
    qqnorm(var_name, main=paste("Q-Q plot without outliers for", title), col = colVal)
    qqline(var_name)
    title(paste("Outlier Check for ", title), outer=TRUE)
    na2 <- sum(is.na(var_name))
    cat("Outliers identified:", na2 - na1, "\n")
    cat("Propotion (%) of outliers:", round((na2 - na1) / sum(!is.na(var_name))*100, 1), "\n")
    cat("Mean of the outliers:", round(mo, 2), "\n")
    m2 <- mean(var_name, na.rm = T)
    cat("Mean without removing outliers:", round(m1, 2), "\n")
    cat("Mean if we remove outliers:", round(m2, 2), "\n")
    
    # response <- readline(prompt="Do you want to remove outliers and to replace with NA? [yes/no]: ")
    # if(response == "y" | response == "yes"){
    if(rm){
        dt[as.character(substitute(var))] <- invisible(var_name)
        #assign(as.character(as.list(match.call())$dt), dt, envir = .GlobalEnv)
        cat("Outliers successfully removed", "\n")
        return(invisible(dt))
    } else {
        cat("Nothing changed", "\n")
        return(invisible(df))
    }
}
weightDat.noOut = outlierKD2(weightDat, Weight, TRUE, "weight (kg)", "red")

## Outliers identified: 3237 
## Propotion (%) of outliers: 1.6 
## Mean of the outliers: 118.11 
## Mean without removing outliers: 70.7 
## Mean if we remove outliers: 69.95 
## Outliers successfully removed
heightDat.noOut = outlierKD2(heightDat, Height, TRUE, "height (cm)", "blue")

## Outliers identified: 1338 
## Propotion (%) of outliers: 0.6 
## Mean of the outliers: 174.08 
## Mean without removing outliers: 175.34 
## Mean if we remove outliers: 175.35 
## Outliers successfully removed
g1 <- ggplot(weightDat, aes(x=Sex, y=Weight, color = Sex)) + geom_boxplot() + geom_boxplot(outlier.colour="black", outlier.shape=16, outlier.size=1) + labs(title="Weight Boxplot of Olympic Athletes\nWith Outliers", x="sex", y="weight (kg)")
g2 <- ggplot(weightDat) + geom_qq(aes(sample = Weight), color = "red") + geom_qq_line(aes(sample = Weight)) + theme_minimal() + labs(title="Q-Q Plot of Weight of Olympic Athletes\nWith Outliers")

g3 <- ggplot(weightDat.noOut, aes(x=Sex, y=Weight, color = Sex)) + geom_boxplot() + geom_boxplot(outlier.colour="black", outlier.shape=16, outlier.size=1) + labs(title="Weight Boxplot of Olympic Athletes\nWithout Outliers", x="sex", y="weight (kg)")
g4 <- ggplot(weightDat.noOut) + geom_qq(aes(sample = Weight), color = "red") + geom_qq_line(aes(sample = Weight)) + theme_minimal() + labs(title="Q-Q Plot of Weight of Olympic Athletes\nWithout Outliers")

g5 <- ggplot(heightDat, aes(x=Sex, y=Height, color = Sex)) + geom_boxplot() + geom_boxplot(outlier.colour="black", outlier.shape=16, outlier.size=1) + labs(title="Height Boxplot of Olympic Athletes\nWith Outliers", x="sex", y="height (cm)")
g6 <- ggplot(weightDat) + geom_qq(aes(sample = Height), color = "red") + geom_qq_line(aes(sample = Height)) + theme_minimal() + labs(title="Q-Q Plot of Height of Olympic Athletes\nWith Outliers")

g7 <- ggplot(heightDat.noOut, aes(x=Sex, y=Height, color = Sex)) + geom_boxplot() + geom_boxplot(outlier.colour="black", outlier.shape=16, outlier.size=1) + labs(title="Height Boxplot of Olympic Athletes \nWithout Outliers", x="sex", y="height (cm)")
g8 <- ggplot(heightDat.noOut) + geom_qq(aes(sample = Height), color = "red") + geom_qq_line(aes(sample = Height)) + theme_minimal() + labs(title="Q-Q Plot of Height of Olympic Athletes \nWithout Outliers")

grid.arrange(g1,g2,g3,g4, ncol = 2, nrow = 2)
## Warning: Removed 3237 rows containing non-finite values (stat_boxplot).

## Warning: Removed 3237 rows containing non-finite values (stat_boxplot).
## Warning: Removed 3237 rows containing non-finite values (stat_qq).
## Warning: Removed 3237 rows containing non-finite values (stat_qq_line).

grid.arrange(g5,g6,g7,g8, ncol = 2, nrow = 2)
## Warning: Removed 1388 rows containing non-finite values (stat_qq).
## Warning: Removed 1388 rows containing non-finite values (stat_qq_line).
## Warning: Removed 1338 rows containing non-finite values (stat_boxplot).

## Warning: Removed 1338 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1338 rows containing non-finite values (stat_qq).
## Warning: Removed 1338 rows containing non-finite values (stat_qq_line).

comment comment comment